Rem
Rem $Header: olstype.sql 09-jun-2004.17:32:14 lvbcheng Exp $
Rem
Rem olstype.sql
Rem
Rem Copyright (c) 2004, Oracle. All rights reserved.  
Rem
Rem    NAME
Rem      olstype.sql - Ordinary Least Squares Regression
Rem
Rem    DESCRIPTION
Rem      This files contains the type definition for Ordinary Least Squares (OLS)
Rem      Regression: y = b0 + b1 z1 + ... + bn zn.
Rem
Rem    NOTES
Rem      The constructor of OLS_Regression requires the user to pass as argument,
Rem      amoung other values, a mean vector (MV) for the predictor variables, a
Rem      variance-covariance matrix (VCM) for the independent variables, and a
Rem      covariance vector (CV) for the dependent and independent variables. 
Rem
Rem      1. With n independent variables (zi) the mean vector (MV) should have n
Rem         entries, where MV[i] = avg(zi).
Rem
Rem      2. The VCM is an NxN symmetric matrix and is passed in as an array containing
Rem         just the upper triangular portion (i <= j) with the first row followed by
Rem         the second row followed by the third row etc. The entry V[i,j] is 
Rem         covar_pop(zi, zj) when i!=j and var_pop(zi) when i = j. So, for example,
Rem         if one has 3 predictor variables (a, b, and c) then conceptually the VCM
Rem         will be:
Rem
Rem                 [     var_pop(a)   covar_pop(a,b)   covar_pop(a,c) ]   
Rem          VCM => [ covar_pop(b,a)       var_pop(b)   covar_pop(b,c) ]   
Rem                 [ covar_pop(c,a)   covar_pop(c,b)       var_pop(c) ]   
Rem
Rem         But this should be passed to OLS_Regression() as:
Rem
Rem          VCM => UTL_NLA_ARRAY_DBL(var_pop(a), covar_pop(a,b), covar_pop(a,c),
Rem                                                   var_pop(b), covar_pop(b,c),
Rem                                                                   var_pop(c))
Rem
Rem      3. With n independent variables (zi) and one dependent variable (y), the
Rem         covariance vector (CV) should have n entries, where CV[i] = 
Rem         covar_pop(y, zi).
Rem      
Rem      4. The standardized score regression requires only a correlation matrix (RM)
Rem         and a correlation vector (RV). The RM is represented the same as the VCM
Rem         above, except RM[i,j] = corr(zi, zj) when i != j, and 1 when i = j. The 
Rem         RV is represented the same way as the CV above, except RV[i] = corr(y,zi).
Rem
Rem    DEPENDENCIES
Rem      This file depends on the UTL_NLA package.
Rem
Rem    REFERENCES
Rem      Johnson, Richard A., and Dean W. Wichern, "Applied Multivariate Statistical
Rem      Analysis (5th ed.)". New Jersey: Prentice Hall, 2002.
Rem
Rem    EXAMPLES
Rem      1. The following SQL statement will return the regression equation of the
Rem         form y = b0 + b1 z1 + b2 z2 for the table ols_data(y number, z1 number,
Rem         z2 number).
Rem
Rem         SELECT model.ols.getEquation() "OLS Regression Equation"
Rem         FROM
Rem         (SELECT OLS_Regression(AVG(y), VAR_POP(y), 
Rem                                UTL_NLA_ARRAY_DBL(AVG(z1),AVG(z2)),
Rem                                UTL_NLA_ARRAY_DBL(VAR_POP(z1),COVAR_POP(z1,z2),VAR_POP(z2)),
Rem                                UTL_NLA_ARRAY_DBL(COVAR_POP(y,z1),COVAR_POP(y,z2))) ols
Rem          FROM ols_data) model;
Rem
Rem      2. The following SQL statement will return the regression equation of the form
Rem               y = 0 + b1 z1 + b2 z2
Rem         for the table ols_data(y number, z1 number, z2 number)
Rem        
Rem         SELECT model.ols.getEquation() "OLS Regression Equation"
Rem         FROM
Rem         (SELECT OLS_Regression(AVG(y), VAR_POP(y), 
Rem                                UTL_NLA_ARRAY_DBL(AVG(z1),AVG(z2)),
Rem                                UTL_NLA_ARRAY_DBL(VAR_POP(z1),COVAR_POP(z1,z2),VAR_POP(z2)),
Rem                                UTL_NLA_ARRAY_DBL(COVAR_POP(y,z1),COVAR_POP(y,z2)), 0) ols
Rem          FROM ols_data) model;
Rem         
Rem      3. Further examples for OLS_Regressions can be found in olsexmpl.sql file.
Rem
Rem    MODIFIED   (MM/DD/YY)
Rem    lvbcheng    06/09/04 - lvbcheng_matrix_prototype
Rem    lvbcheng    05/07/04 - Swap info and pack 
Rem    lvbcheng    05/04/04 - info as out param 
Rem    lvbcheng    04/30/04 - 
Rem    achaudhr    03/30/04 - Created
Rem

SET ECHO ON
SET FEEDBACK 1
SET NUMWIDTH 10
SET LINESIZE 80
SET TRIMSPOOL ON
SET TAB OFF
SET PAGESIZE 100

drop type OLS_Regression
/

create or replace type OLS_Regression as object
(
    /** Private State **/

    alpha       number,
    betas       UTL_NLA_ARRAY_DBL,
    betaCount   number,
    correlation number,
    mse         number,

    /** Public Methods **/

    -- Constructor for Raw Score Regression
    constructor function OLS_Regression(
           -- Mean of the dependent variable
           mean_y      number,
           -- Variance of dependent variable
           variance_y  number,
           -- Mean Vector (MV) for the independent variables
           MV          UTL_NLA_ARRAY_DBL,
           -- Variance Covariance Matrix (VCM) in packed upper-triangular row-major form
           VCM         UTL_NLA_ARRAY_DBL,
           -- Covariance Vector (CV) for the independent and dependent variables
           CV          UTL_NLA_ARRAY_DBL,
           -- Optionally, set an intercept (b0 some number)
           intercept number default NULL
     ) return self as result,

    -- Constructor for Standardized Score Regression
    constructor function OLS_Regression(
           -- Correlation Matrix (RM) in packed upper-triangular row-major form
           RM UTL_NLA_ARRAY_DBL,
           -- Correlation Vector (RV) for the independent and dependent variables
           RV UTL_NLA_ARRAY_DBL
    ) return self as result,

    -- Returns the predicted value of y for given zi value (i in 1..n)
    member function predict(predictors UTL_NLA_ARRAY_DBL) return number,

    -- Returns the regression equation: y = b0 + b1 z1 + ... + bn zn
    member function getEquation (roundBy integer default 3) return varchar,

    -- Returns the specified regression coefficient (bn). 
    member function getCoefficient(n integer) return number,

    -- Returns the sum of the K-th power of the coefficients in the given range
    member function getCoeffSumK(first    integer default 1,
                                 last     integer default null,
                                 K        number  default 2) return number,

    -- Returns the (first) index of the largest regression coefficient in the
    -- index range 
    member function getCoeffMaxIdx(first integer default 1, 
                                   last  integer default null) return integer,

    -- Returns the (first) index of the smallest regression coefficient in the
    -- index range 
    member function getCoeffMinIdx(first integer default 1, 
                                   last  integer default null) return integer,

    -- Returns the mean square error
    member function getError return number,

    -- Returns the multiple correlation coefficient
    member function getCorrelation return number,

    -- Some tests for this object type
    static procedure unittest
);
/
show errors

create or replace type body OLS_Regression is 

    constructor function OLS_Regression(
           -- Mean of the dependent variable
           mean_y      number,
           -- Variance of dependent variable
           variance_y  number,
           -- Mean Vector (MV) for the independent variables
           MV          UTL_NLA_ARRAY_DBL,
           -- Variance Covariance Matrix (VCM) in packed upper-triangular row-major form
           VCM         UTL_NLA_ARRAY_DBL,
           -- Covariance Vector (CV) for the independent and dependent variables
           CV          UTL_NLA_ARRAY_DBL,
           -- Optionally, set an intercept (b0 some number)
           intercept number default NULL
    ) return  self as result is
       common               number;
       dotprd               number;
       DO_OPTIMIZE constant boolean := (intercept is null) and
                                       (MV.count < 4);
    begin  
      betaCount := MV.count;

      if (betaCount < 1 or CV.count != betaCount or
          VCM.count != (betaCount * (betaCount + 1)) / 2) then 
          raise_application_error(-20000, 'Error in arguments');
      end if;

      betas := UTL_NLA_ARRAY_DBL(0);
      if (betaCount > 1) then betas.extend(betaCount - 1, 1); end if;

      if (DO_OPTIMIZE != true) then
        declare
          s_zz UTL_NLA_ARRAY_DBL := UTL_NLA_ARRAY_DBL(0);
          s_yz UTL_NLA_ARRAY_DBL := UTL_NLA_ARRAY_DBL(0);
          perm UTL_NLA_ARRAY_INT := UTL_NLA_ARRAY_INT();
          info integer;
        begin
          if (VCM.count > 1) then s_zz.extend(VCM.count-1, 1); end if;
          if (CV.count  > 1) then s_yz.extend(CV.count-1,  1); end if;
          for i in 1 .. VCM.count  loop s_zz(i) := VCM(i); end loop;
          for i in 1 .. CV.count   loop s_yz(i) := CV(i);  end loop;

          if (intercept IS NOT NULL) then
            declare
              mean_y_double binary_double := mean_y;
              M_z UTL_NLA_ARRAY_DBL := UTL_NLA_ARRAY_DBL(0);
              M_z_copy UTL_NLA_ARRAY_DBL := UTL_NLA_ARRAY_DBL(0);
            begin
              M_z.extend(MV.count, 1);
              M_z_copy.extend(MV.count, 1);

              for i in 1..MV.count loop 
                M_z(i) := MV(i); 
                M_z_copy(i) := MV(i); 
              end loop;
              utl_nla.blas_axpy(n => betaCount, alpha => mean_y_double - intercept, 
                                x => M_z_copy, incx => 1, 
                                y => s_yz, incy => 1);
              utl_nla.blas_spr('U', betaCount, 1, M_z, 1, s_zz, 'R');
           end;
          end if;

          utl_nla.lapack_spsv(uplo => 'U', n => betaCount, nrhs => 1,
                              ap => s_zz, ipiv => perm,
                              b  => s_yz, ldb => 1, info => info, pack =>'R');
          if (info != 0) then goto collinearity_error; end if;
          for i in 1 .. betaCount loop betas(i) := s_yz(i); end loop;
        end;
      -- Optimize the cases when (1 <= betaCount <= 3)
      elsif (betaCount = 1) then 
        common := VCM(1);
        if (common = 0) then goto collinearity_error; end if;
        betas(1) := CV(1)/common;
      elsif (betaCount = 2) then 
        common := VCM(1)*VCM(3) - VCM(2) * VCM(2);
        if (common = 0) then goto collinearity_error; end if;
        betas(1) := (VCM(3)*CV(1) - VCM(2)*CV(2)) / common;
        betas(2) := (VCM(1)*CV(2) - VCM(2)*CV(1)) / common;
      else 
        common :=
       (VCM(3)*VCM(3)*VCM(4) - 
      2*VCM(2)*VCM(3)*VCM(5) +
        VCM(2)*VCM(2)*VCM(6) +
        VCM(1)*VCM(5)*VCM(5) -
        VCM(1)*VCM(4)*VCM(6));
        if (common = 0) then goto collinearity_error; end if;
        betas(1) :=
       (VCM(5)*VCM(5)*CV(1) -
        VCM(4)*VCM(6)*CV(1) +
        VCM(2)*VCM(6)*CV(2) +
        VCM(3)*VCM(4)*CV(3) -
        VCM(5)*VCM(3)*CV(2) -
        VCM(5)*VCM(2)*CV(3)) / common;
        betas(2) :=
       (VCM(2)*VCM(6)*CV(1) +
        VCM(3)*VCM(3)*CV(2) -
        VCM(1)*VCM(6)*CV(2) +
        VCM(1)*VCM(5)*CV(3) -
        VCM(3)*VCM(5)*CV(1) -
        VCM(3)*VCM(2)*CV(3)) / common;
        betas(3) :=
       (VCM(3)*VCM(4)*CV(1) -
        VCM(2)*VCM(5)*CV(1) -
        VCM(2)*VCM(3)*CV(2) +
        VCM(1)*VCM(5)*CV(2) +
        VCM(2)*VCM(2)*CV(3) -
        VCM(1)*VCM(4)*CV(3)) / common;
      end if;

      if (intercept is not null) then
        alpha  := intercept;
      else
        dotprd := utl_nla.blas_dot(betaCount, MV, 1, betas, 1);    
        alpha  := mean_y - dotprd;
      end if;

      dotprd      := utl_nla.blas_dot(betaCount, CV, 1, betas, 1);
      correlation := sqrt(dotprd/variance_y);
      mse         := variance_y - dotprd;

      return;

      <<collinearity_error>>
      raise_application_error(-20010, 'Collinearity error: Check for high correlation in independent variables');

      return;
    end;

    constructor function OLS_Regression(
           -- Correlation Matrix (RM) in packed upper-triangular row-major form
           RM UTL_NLA_ARRAY_DBL,
           -- Correlation Vector (RV) for the independent and dependent variables
           RV UTL_NLA_ARRAY_DBL
    ) return self as result is
       ols OLS_Regression;
       MV  UTL_NLA_ARRAY_DBL := UTL_NLA_ARRAY_DBL(0);
    begin
      if (RV.count > 1) then MV.extend(RV.count-1, 1); end if;
      ols := OLS_Regression(0, 1, MV, RM, RV);
      self.alpha       := 0;
      self.betas       := ols.betas;
      self.betaCount   := ols.betaCount;
      self.correlation := ols.correlation;
      self.mse         := ols.mse;

      return;
    end;

    member function predict(predictors UTL_NLA_ARRAY_DBL)
    return number is
      rval number := 0;
    begin
      if (betaCount != predictors.count) then return null; end if;
      for i in 1 .. betaCount loop rval:= rval + betas(i)*predictors(i); end loop;
      return alpha + rval;
    end;

    member function getEquation(roundBy integer default 3)
    return varchar is 
       eqn varchar(1000) := 'y = ';
    begin
        if (betaCount is null) then return null; end if;
        eqn := eqn || round(alpha, roundBy);
        for s in 1 .. betaCount loop
           if (betas(s) < 0) then
            eqn := eqn || ' - ' || round(abs(betas(s)), roundBy) || ' z' || s;
           else
            eqn := eqn || ' + ' || round(abs(betas(s)), roundBy) || ' z' || s;
           end if;
        end loop;
        return eqn;
    end;

    member function getCoefficient(n integer) 
    return number is
    begin 
      if (betaCount is null or n < 0 or n > betaCount) then return null; end if;
      if (n = 0) then return alpha; end if;
      return betas(n);
    end;

    member function getCoeffSumK(first    integer default 1,
                                 last     integer default null, 
                                 K        number  default 2)
    return number is
       first_ integer := first;
       last_  integer := last;
       rval   binary_double := 0;
    begin 
       if (first_ is null or first_ < 0)         then first_ := 1;         end if;
       if (last_  is null or last_  > betaCount) then last_  := betaCount; end if;
       if (betaCount is null or first_ > last_)  then return null; end if;
       if (first_ = 0) then rval := power(alpha, K); first_ := 1;  end if; 
       for s in first_ .. last_ loop rval := rval + power(betas(s), K); end loop;
       return rval;
    end;

    member function getCoeffMaxIdx(first integer default 1,
                                   last  integer default null) return integer
    is
       first_ integer := first;
       last_  integer := last;
       rval   integer;
       mval   binary_double;
    begin 
       if (first_ is null or first_ < 0)         then first_ := 1;         end if;
       if (last_  is null or last_  > betaCount) then last_  := betaCount; end if;
       if (betaCount is null or first_ > last_)  then return null; end if;
       if (first_ = 0) then mval := alpha; else mval := betas(first_); end if;
       rval := first_;
       for s in first_ + 1 .. last_ loop
         if (betas(s) > mval) then mval := betas(s); rval := s; end if;
       end loop;
       return rval;
    end;

    member function getCoeffMinIdx(first integer default 1,
                                   last  integer default null) return integer
    is
       first_ integer := first;
       last_  integer := last;
       rval   integer;
       mval   binary_double;
    begin 
       if (first_ is null or first_ < 0)         then first_ := 1;         end if;
       if (last_  is null or last_  > betaCount) then last_  := betaCount; end if;
       if (betaCount is null or first_ > last_)  then return null; end if;
       if (first_ = 0) then mval := alpha; else mval := betas(first_); end if;
       rval := first_;
       for s in first_ + 1 .. last_ loop
         if (betas(s) < mval) then mval := betas(s); rval := s; end if;
       end loop;
       return rval;
    end;

    member function getCorrelation
    return number is
    begin
        return correlation;
    end;

    member function getError
    return number is
    begin
        return mse;
    end;
    
    static procedure unittest is
      ols OLS_Regression;
      tst varchar(2000);
    begin
 
     -- Test1: Verifies the model: y = 3 + x1 - 2*x2
      tst := 'Test 1';
      ols := OLS_Regression(5, 10, UTL_NLA_ARRAY_DBL(2, 0), UTL_NLA_ARRAY_DBL(7, 3, 2), UTL_NLA_ARRAY_DBL(1, -1));
      if (ols.getCoefficient(0) !=  3        or 
          ols.getCoefficient(1) !=  1        or 
          ols.getCoefficient(2) != -2        or 
          ols.getError          !=  7        or
          ols.getCoeffMaxIdx    !=  1        or
          ols.getCoeffMinIdx    !=  2        or
          ols.getCoeffSumK      !=  5        or
          ols.getCorrelation    != sqrt(0.3) or
          abs(ols.predict(UTL_NLA_ARRAY_DBL(1,2))) > 1.00E-10) then
         tst := tst || ': Coeff = [' || ols.getCoefficient(0) || ', '
                                     || ols.getCoefficient(1) || ', '
                                     || ols.getCoefficient(2) || ']' 
                    || ': MSE = '    || round(ols.getCorrelation, 3)
                    || ': R = '      || ols.getError
                    || ': MaxIdx = ' || ols.getCoeffMaxIdx 
                    || ': MinIdx = ' || ols.getCoeffMinIdx 
                    || ': SumK = '   || ols.getCoeffSumK;
         goto unit_test_failed;
      end if;

      -- Test2: Verifies the model: y = 10 - 5*x + 4*(x^2) - 3*(x^3) + 2*(x^4) - 1*(x^5)
      tst := 'Test 2';
      declare
        t    UTL_NLA_ARRAY_DBL := UTL_NLA_ARRAY_DBL();
      begin
        t.extend(least(100, t.limit));
        for i in 1 .. t.count loop t(i) := i - trunc(t.count/2); end loop;

        select OLS_Regression(avg(y), var_pop(y),
                              UTL_NLA_ARRAY_DBL(avg(x1),avg(x2),avg(x3),avg(x4),avg(x5)),
                              UTL_NLA_ARRAY_DBL(var_pop(x1),covar_pop(x1,x2),covar_pop(x1,x3),covar_pop(x1,x4),covar_pop(x1,x5),
                                                         var_pop(x2),covar_pop(x2,x3),covar_pop(x2,x4),covar_pop(x2,x5),
                                                                          var_pop(x3),covar_pop(x3,x4),covar_pop(x3,x5),
                                                                                           var_pop(x4),covar_pop(x4,x5),
                                                                                                            var_pop(x5)),
                              UTL_NLA_ARRAY_DBL(covar_pop(y,x1),covar_pop(y,x2),covar_pop(y,x3),covar_pop(y,x4),covar_pop(y,x5)))
        into ols
        from (select x1, x2, x3, x4, x5, 10 - (5*x1) + (4*x2) - (3*x3) + (2*x4) - (1*x5) y
              from (select power(column_value, 1) x1,
                           power(column_value, 2) x2,
                           power(column_value, 3) x3,
                           power(column_value, 4) x4,
                           power(column_value, 5) x5
                    from table(cast(t as UTL_NLA_ARRAY_DBL))));

        if (abs(ols.getCoefficient(0) - 10) > 1.00E-5  or 
            abs(ols.getCoefficient(1) +  5) > 1.00E-5  or 
            abs(ols.getCoefficient(2) -  4) > 1.00E-5  or 
            abs(ols.getCoefficient(3) +  3) > 1.00E-5  or 
            abs(ols.getCoefficient(4) -  2) > 1.00E-5  or 
            abs(ols.getCoefficient(5) +  1) > 1.00E-5  ) then
           tst := tst || ': Coeff = [' || ols.getCoefficient(0) || ', '
                                       || ols.getCoefficient(1) || ', '
                                       || ols.getCoefficient(2) || ', '
                                       || ols.getCoefficient(3) || ', '
                                       || ols.getCoefficient(4) || ', '
                                       || ols.getCoefficient(5) || ']'
                      || ': MSE = '    || ols.getCorrelation 
                      || ': R = '      || ols.getError;
           goto unit_test_failed;
        end if;
     end;

      -- Test3: Verifies the model: y = 0 + 2.07438016528926 x (with a forced 0 intercept)
      tst := 'Test 3';
      declare
        t    UTL_NLA_ARRAY_DBL := UTL_NLA_ARRAY_DBL();
      begin
        t.extend(least(11, t.limit));
        for i in 1 .. t.count loop t(i) := i-1; end loop;

        select OLS_Regression(avg(y), var_pop(y),
                              UTL_NLA_ARRAY_DBL(avg(x)),
                              UTL_NLA_ARRAY_DBL(var_pop(x)),
                              UTL_NLA_ARRAY_DBL(covar_pop(y, x)), 0)
        into ols
        from (select column_value + 130 y, column_value + 60 x
              from table(cast(t as UTL_NLA_ARRAY_DBL)));

        if (abs(ols.getCoefficient(0) - 0) > 1.00E-5  or 
            abs(ols.getCoefficient(1) -  2.07438016528926) > 1.00E-5) then
           tst := tst || ': Coeff = [' || ols.getCoefficient(0) || ', '
                                       || ols.getCoefficient(1) || ']'
                      || ': MSE = '    || ols.getCorrelation 
                      || ': R = '      || ols.getError;
           goto unit_test_failed;
        end if;
      end;

      tst := 'Test 4';
      ols := OLS_Regression(5, 10, UTL_NLA_ARRAY_DBL(2, 0), UTL_NLA_ARRAY_DBL(7, 3, 2), UTL_NLA_ARRAY_DBL(1, -1), 3);
      if (ols.getCoefficient(0) !=  3        or 
          ols.getCoefficient(1) !=  1        or 
          ols.getCoefficient(2) != -2        or 
          ols.getError          !=  7        or
          ols.getCoeffMaxIdx    !=  1        or
          ols.getCoeffMinIdx    !=  2        or
          ols.getCoeffSumK      !=  5        or
          ols.getCorrelation    != sqrt(0.3) or
          abs(ols.predict(UTL_NLA_ARRAY_DBL(1,2))) > 1.00E-10) then
         tst := tst || ': Coeff = [' || ols.getCoefficient(0) || ', '
                                     || ols.getCoefficient(1) || ', '
                                     || ols.getCoefficient(2) || ']' 
                    || ': MSE = '    || round(ols.getCorrelation, 3)
                    || ': R = '      || ols.getError
                    || ': MaxIdx = ' || ols.getCoeffMaxIdx 
                    || ': MinIdx = ' || ols.getCoeffMinIdx 
                    || ': SumK = '   || ols.getCoeffSumK;
         goto unit_test_failed;
      end if;

      return;

      <<unit_test_failed>>
      raise_application_error(-20100, 'OLS_Regression Unit Test Failed: ' || tst);
    end;

end;
/
show errors

create or replace public synonym OLS_Regression for OLS_Regression;
grant execute on OLS_Regression to public;

Rem
Rem Run the unittest
Rem

begin OLS_Regression.unittest; end;
/


